In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use PNG as the image format for inline figures.
%config InlineBackend.figure_format = 'png'
# Show up to 50 columns when a DataFrame is displayed.
# The fully qualified key is required: the abbreviated "max_columns" is
# ambiguous on newer pandas releases (it also matches the Styler option)
# and raises OptionError there.
pd.set_option("display.max_columns", 50)
In [2]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
test = pd.read_csv("../data/test.csv")
test_id = test["id"]
In [40]:
# use_col = ["is_booking", "user_id", "date_time","user_location_country","orig_destination_distance", "srch_co","srch_ci","user_location_region",\
# "hotel_market","srch_destination_id","hotel_cluster"]
In [61]:
# Display the column labels of the sampled training frame.
train.columns
Out[61]:
In [62]:
test.columns # id appears, and hotel_cluster / is_booking / cnt are gone
Out[62]:
In [3]:
# Feature columns used for the first model (selected from both train and test).
use_col2 = [
    "user_id",
    "date_time",
    "user_location_country",
    "orig_destination_distance",
    "srch_co",
    "srch_ci",
    "user_location_region",
    "hotel_market",
    "srch_destination_id",
]
In [4]:
# Keep the target as a one-column DataFrame (must be taken before `train`
# is narrowed to the feature columns below).
train_y = train[["hotel_cluster"]]
# Narrow both frames to the shared feature columns. The explicit .copy()
# makes each result an independent frame, so the column assignments done
# later during preprocessing do not raise SettingWithCopyWarning.
train = train[use_col2].copy()
test = test[use_col2].copy()
In [88]:
In [5]:
%%time
le = preprocessing.LabelEncoder()
# train.fillna(0)
# test.fillna(0)
train["date_time"] = pd.to_datetime(train["date_time"], errors="coerce")
train["date_time"] = train["date_time"].dt.date
train["srch_ci"] = pd.to_datetime(train["srch_ci"], errors="coerce")
train["srch_co"] = pd.to_datetime(train["srch_co"], errors="coerce")
train["date_time"] = le.fit_transform(train["date_time"])
train["srch_ci"] = le.fit_transform(train["srch_ci"])
train["srch_co"] = le.fit_transform(train["srch_co"])
train["orig_destination_distance"].fillna(0, inplace=True)
test["date_time"] = pd.to_datetime(test["date_time"], errors="coerce")
test["date_time"] = test["date_time"].dt.date
test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["date_time"] = le.fit_transform(test["date_time"])
test["srch_ci"] = le.fit_transform(test["srch_ci"])
test["srch_co"] = le.fit_transform(test["srch_co"])
test["orig_destination_distance"].fillna(0, inplace=True)
In [12]:
# Small, shallow random forest; the fixed random_state keeps runs repeatable
# and n_jobs=-1 uses every available core.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=7,
    n_jobs=-1,
    random_state=777,
)
In [16]:
%%time
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
In [32]:
# Peek at the five highest-ranked entries of every test row.
preds[:, :5]
Out[32]:
In [49]:
# Join each row's top-5 entries into one space-separated string.
top5_as_text = preds[:, :5].astype(str)
result_df = pd.DataFrame([" ".join(labels) for labels in top5_as_text])
In [57]:
# Name the single prediction column and turn the index labels into strings.
result_df = result_df.rename(
    index=str,
    columns={0: "hotel_cluster"},
)
In [72]:
# Move the index into a regular column and call that column "id".
result_df = (
    result_df
    .reset_index()
    .rename(index=str, columns={"index": "id"})
)
In [81]:
# "201702061420.csv" is not written by any cell in this notebook, so a fresh
# Restart & Run All would fail on the read. When the file exists, behave as
# before (index by "id" and drop the stray "Unnamed: 0" column); otherwise
# rebuild the same table from result_df.
# NOTE(review): the file is presumably an earlier dump of result_df (the
# "Unnamed: 0" column matches a to_csv of its unnamed index) - confirm.
previous_submission = "201702061420.csv"
if os.path.exists(previous_submission):
    result_df1 = pd.read_csv(previous_submission, index_col="id").drop(["Unnamed: 0"], axis=1)
else:
    result_df1 = result_df.set_index("id")
In [83]:
# Write the submission; the "id" index is emitted as the first CSV column.
result_df1.to_csv("201702061422.csv", index=True)
In [86]:
# Show the last five rows of the submission table.
result_df1.tail(5)
Out[86]:
public score = 0.14201
In [110]:
print("="*20)
# Aliases for the frames the current `model` was fitted on.
trn_x1 = train
trn_y1 = train_y
# model = RandomForestClassifier(max_depth=3, n_jobs=-1, random_state=402)
# model.fit(trn_x1,trn_y1)

# Forest-level importances, plus their spread across the individual trees
# (used for the error bars below).
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(trn_x1.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x1.columns[indices[f]], importances[indices[f]]))

# Bar chart of the ranked importances; tick labels are the feature positions
# printed in the ranking above.
plt.title("Feature importances")
plt.bar(range(trn_x1.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x1.shape[1]), indices)
plt.xlim([-1, trn_x1.shape[1]])
plt.show()
In [ ]:
In [2]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
print("resd the train.csv")
use_col3 = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']
train_y = train[["hotel_cluster"]]
train = train[use_col3]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col3]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [8]:
# Forest-level importances and their spread across the individual trees.
importances = model.feature_importances_
per_tree_importances = [estimator.feature_importances_ for estimator in model.estimators_]
std = np.std(per_tree_importances, axis=0)
# Feature positions, most important first.
indices = np.argsort(importances)[::-1]

n_features = train.shape[1]
print("Feature ranking:")
for position in range(n_features):
    feature_idx = indices[position]
    print("%d. feature %d %s (%f)" % (position + 1, feature_idx, train.columns[feature_idx], importances[feature_idx]))

# Bar chart of the ranked importances with per-tree standard deviation bars.
bar_positions = range(n_features)
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()
In [18]:
In [38]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
In [2]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
print("read the train.csv")
use_col3 = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']
train_y = train[["hotel_cluster"]]
train = train[use_col3]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col3]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
print("save file")
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [6]:
# Show the first five rows of the feature frame.
train.head(5)
Out[6]:
In [ ]: